Notes:
Notes: A few bins at around ages 70, 100, and 111 have an abnormal number of entries with higher friend counts. Otherwise, younger Facebook users tend to have more friends.
# Work from the lesson directory so the data file resolves by its bare name.
# NOTE(review): this is a hard-coded, machine-specific path.
setwd('~/Dropbox/Projects/da/eda/lesson4/')
# Read the tab-delimited pseudo-Facebook data set into `pf`.
pf <- read.delim(file = 'pseudo_facebook.tsv')
library(ggthemes)
## Loading required package: ggplot2
# Use the Economist theme (base font size 12) for every plot that follows.
theme_set(theme_economist(base_size = 12))
## Warning: New theme missing the following elements: legend.box,
## panel.margin.x, panel.margin.y
# Quick scatterplot of friend count against age.
qplot(x = age, y = friend_count, data = pf)
Response:
Notes:
library(ggplot2)
# Same scatterplot in full ggplot syntax, with the x-axis limited to 13-90.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point() +
  xlim(13, 90)
## Warning: Removed 4906 rows containing missing values (geom_point).
# Five-number summary (plus mean) of user age.
with(pf, summary(age))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 13.00 20.00 28.00 37.28 50.00 113.00
Notes:
# Jitter the points and draw them at 5% opacity to reduce overplotting.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_jitter(alpha = 0.05) +
  xlim(13, 90)
## Warning: Removed 5199 rows containing missing values (geom_point).
Response: There’s a clearer concentration of users with high friend counts at around age 20, while the spike at age 69 still persists. ***
Notes:
# Open the documentation for coord_trans().
help('coord_trans')
# Friend count by age on a square-root y-axis.
# Jitter height is 0, so points move only horizontally and the y values
# are plotted unchanged.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point(alpha = 0.05, position = position_jitter(height = 0)) +
  xlim(13, 90) +
  coord_trans(y = 'sqrt')
## Warning: Removed 5183 rows containing missing values (geom_point).
Notes: Explore the relationship between friends initiated and age
# Friendships initiated by age: 5% opacity, horizontal-only jitter,
# ages 13-90, square-root y-axis.
ggplot(data = pf, mapping = aes(x = age, y = friendships_initiated)) +
  geom_point(alpha = 0.05, position = position_jitter(height = 0)) +
  xlim(13, 90) +
  coord_trans(y = 'sqrt')
## Warning: Removed 5161 rows containing missing values (geom_point).
Notes:
Notes:
# install.packages('dplyr')
library('dplyr')
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Summarise friend counts per age: mean, median, and number of users.
age_groups <- group_by(pf, age)
pf.fc_by_age <- summarize(age_groups,
                          friend_count_mean = mean(friend_count),
                          friend_count_median = median(friend_count),
                          n = n())
# Sort explicitly by age; arrange() without a sort column reorders nothing.
pf.fc_by_age <- arrange(pf.fc_by_age, age)
# Show the first rows of the summary table.
head(pf.fc_by_age)
## Source: local data frame [6 x 4]
##
## age friend_count_mean friend_count_median n
## (int) (dbl) (dbl) (int)
## 1 13 164.7500 74.0 484
## 2 14 251.3901 132.0 1925
## 3 15 347.6921 161.0 2618
## 4 16 351.9371 171.5 3086
## 5 17 350.3006 156.0 3283
## 6 18 331.1663 162.0 5196
# Alternate method: build the same per-age summary as one %>% pipeline.
pf.fc_by_age <- pf %>%
  group_by(age) %>%
  summarise(
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ) %>%
  arrange(age)
# Show the first 20 age groups.
head(pf.fc_by_age, n = 20)
## Source: local data frame [20 x 4]
##
## age friend_count_mean friend_count_median n
## (int) (dbl) (dbl) (int)
## 1 13 164.7500 74.0 484
## 2 14 251.3901 132.0 1925
## 3 15 347.6921 161.0 2618
## 4 16 351.9371 171.5 3086
## 5 17 350.3006 156.0 3283
## 6 18 331.1663 162.0 5196
## 7 19 333.6921 157.0 4391
## 8 20 283.4991 135.0 3769
## 9 21 235.9412 121.0 3671
## 10 22 211.3948 106.0 3032
## 11 23 202.8426 93.0 4404
## 12 24 185.7121 92.0 2827
## 13 25 131.0211 62.0 3641
## 14 26 144.0082 75.0 2815
## 15 27 134.1473 72.0 2240
## 16 28 125.8354 66.0 2364
## 17 29 120.8182 66.0 1936
## 18 30 115.2080 67.5 1716
## 19 31 118.4599 63.0 1694
## 20 32 114.2800 63.0 1443
Create your plot!
# Jittered scatter of friendships initiated by age (13-90, sqrt y-axis),
# overlaid with per-age summary lines: the mean in the default colour and
# the 10th / 50th / 90th percentiles in red (10th and 90th dashed).
# NOTE(review): `fun.y` and the pass-through `probs` follow the ggplot2
# version this notebook was knitted with; newer releases reportedly use
# `fun` and `fun.args` instead -- confirm before upgrading.
ggplot(data = pf, mapping = aes(x = age, y = friendships_initiated)) +
  geom_point(
    color = '#099DD9',
    alpha = 0.05,
    position = position_jitter(height = 0)
  ) +
  xlim(13, 90) +
  coord_trans(y = 'sqrt') +
  geom_line(stat = 'summary', fun.y = mean) +
  geom_line(
    stat = 'summary', fun.y = quantile, probs = 0.1,
    color = '#CC2127', linetype = 2
  ) +
  geom_line(
    stat = 'summary', fun.y = quantile, probs = 0.5,
    color = '#CC2127'
  ) +
  geom_line(
    stat = 'summary', fun.y = quantile, probs = 0.9,
    color = '#CC2127', linetype = 2
  )
## Warning: Removed 4906 rows containing missing values (stat_summary).
## Warning: Removed 4906 rows containing missing values (stat_summary).
## Warning: Removed 4906 rows containing missing values (stat_summary).
## Warning: Removed 4906 rows containing missing values (stat_summary).
## Warning: Removed 5188 rows containing missing values (geom_point).
Notes: The median is much lower than the mean, indicating a right-skewed distribution: most users have lower counts than the mean suggests.
# Same overlay of raw points and summary lines, but zoomed with
# coord_cartesian() to ages 13-70 and counts 0-1000 instead of using
# xlim() and a transformed y-axis.
ggplot(data = pf, mapping = aes(x = age, y = friendships_initiated)) +
  coord_cartesian(xlim = c(13, 70), ylim = c(0, 1000)) +
  geom_point(
    color = '#099DD9',
    alpha = 0.05,
    position = position_jitter(height = 0)
  ) +
  geom_line(stat = 'summary', fun.y = mean) +
  geom_line(
    stat = 'summary', fun.y = quantile, probs = 0.1,
    color = '#CC2127', linetype = 2
  ) +
  geom_line(
    stat = 'summary', fun.y = quantile, probs = 0.5,
    color = '#CC2127'
  ) +
  geom_line(
    stat = 'summary', fun.y = quantile, probs = 0.9,
    color = '#CC2127', linetype = 2
  )
# Mean friend count per age, drawn from the pre-computed summary table.
ggplot(data = pf.fc_by_age, mapping = aes(x = age, y = friend_count_mean)) +
  geom_line()
Response:
See the Instructor Notes of this video to download Moira’s paper on perceived audience size and to see the final plot.
Notes:
Notes:
# Correlation coefficient between age and friend count over all users.
with(pf, cor(age, friend_count))
Look up the documentation for the cor.test function.
What’s the correlation between age and friend count? Round to three decimal places. Response: -0.027 ***
Notes:
# Repeat the correlation test for users aged 13 to 70 only.
with(
  subset(pf, age >= 13 & age <= 70),
  cor.test(x = age, y = friend_count)
)
##
## Pearson's product-moment correlation
##
## data: age and friend_count
## t = -52.592, df = 91029, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.1780220 -0.1654129
## sample estimates:
## cor
## -0.1717245
Notes:
Notes:
# Likes received overall against likes received through the desktop site.
ggplot(data = pf, mapping = aes(x = www_likes_received, y = likes_received)) +
  geom_point() +
  labs(x = 'Likes received through desktop web', y = 'Likes received')
Notes:
# Same scatter with both axes cut at the 95th percentile, plus a linear
# fit drawn in red.
ggplot(data = pf, mapping = aes(x = www_likes_received, y = likes_received)) +
  geom_point() +
  xlim(0, quantile(pf$www_likes_received, probs = 0.95)) +
  ylim(0, quantile(pf$likes_received, probs = 0.95)) +
  geom_smooth(method = 'lm', color = '#CC2127') +
  labs(x = 'Likes received through desktop web', y = 'Likes received')
## Warning: Removed 6075 rows containing missing values (stat_smooth).
## Warning: Removed 6075 rows containing missing values (geom_point).
What’s the correlation between the two variables? Include the top 5% of values for the variable in the calculation and round to 3 decimal places.
# Correlation test on the full data, including the top 5% of values.
cor.test(x = pf$www_likes_received, y = pf$likes_received)
##
## Pearson's product-moment correlation
##
## data: pf$www_likes_received and pf$likes_received
## t = 937.1, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9473553 0.9486176
## sample estimates:
## cor
## 0.9479902
Response: 0.948 ***
Notes:
Notes:
# install.packages('alr3')
library(alr3)
## Loading required package: car
# Load the Mitchell data set and open its help page.
data('Mitchell')
help('Mitchell')
Create your plot!
# Scatterplot of temperature against month index.
ggplot(data = Mitchell, mapping = aes(x = Month, y = Temp)) +
  geom_point()
Take a guess for the correlation coefficient for the scatterplot. 0
What is the actual correlation of the two variables? (Round to the thousandths place) 0.057
# Correlation test between month index and temperature.
cor.test(x = Mitchell$Month, y = Mitchell$Temp)
##
## Pearson's product-moment correlation
##
## data: Mitchell$Month and Mitchell$Temp
## t = 0.81816, df = 202, p-value = 0.4142
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.08053637 0.19331562
## sample estimates:
## cor
## 0.05747063
Notes:
# Month is a numeric index, so it needs a continuous x scale rather than a
# discrete one. A break every 12 months makes the yearly cycle visible.
ggplot(aes(Month, Temp), data = Mitchell) +
  geom_point() +
  scale_x_continuous(breaks = seq(0, 203, 12))
What do you notice? Response: There seems to be a cyclical pattern corresponding to the seasons of the year.
Watch the solution video and check out the Instructor Notes! Notes:
Notes:
# Mean friend count per age as a line, from the per-age summary table.
ggplot(data = pf.fc_by_age, mapping = aes(x = age, y = friend_count_mean)) +
  geom_line()
# Show the first rows of the summary table.
head(x = pf.fc_by_age)
## Source: local data frame [6 x 4]
##
## age friend_count_mean friend_count_median n
## (int) (dbl) (dbl) (int)
## 1 13 164.7500 74.0 484
## 2 14 251.3901 132.0 1925
## 3 15 347.6921 161.0 2618
## 4 16 351.9371 171.5 3086
## 5 17 350.3006 156.0 3283
## 6 18 331.1663 162.0 5196
# Show rows 17 through 19 of the per-age summary.
pf.fc_by_age[seq(17, 19), ]
## Source: local data frame [3 x 4]
##
## age friend_count_mean friend_count_median n
## (int) (dbl) (dbl) (int)
## 1 29 120.8182 66.0 1936
## 2 30 115.2080 67.5 1716
## 3 31 118.4599 63.0 1694
# Fractional age: add the share of the year left after the birth month
# (1 - dob_month / 12) to the age in years.
# Assumes dob_month runs 1-12, so December adds 0 and January adds 11/12
# -- TODO confirm.
pf[['age_with_months']] <- pf$age + (1 - pf$dob_month / 12)
Programming Assignment
# Per-(fractional)-age summary of friend counts: mean, median, group size.
pf.fc_by_age_months <- pf %>%
  group_by(age_with_months) %>%
  summarise(
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ) %>%
  arrange(age_with_months)
# Show the first 20 month-level groups.
head(pf.fc_by_age_months, n = 20)
## Source: local data frame [20 x 4]
##
## age_with_months friend_count_mean friend_count_median n
## (dbl) (dbl) (dbl) (int)
## 1 13.16667 46.33333 30.5 6
## 2 13.25000 115.07143 23.5 14
## 3 13.33333 136.20000 44.0 25
## 4 13.41667 164.24242 72.0 33
## 5 13.50000 131.17778 66.0 45
## 6 13.58333 156.81481 64.0 54
## 7 13.66667 130.06522 75.5 46
## 8 13.75000 205.82609 122.0 69
## 9 13.83333 215.67742 111.0 62
## 10 13.91667 162.28462 71.0 130
## 11 14.00000 194.13115 105.0 122
## 12 14.08333 226.67568 106.0 111
## 13 14.16667 270.73611 146.0 144
## 14 14.25000 218.86131 132.0 137
## 15 14.33333 313.24000 148.5 150
## 16 14.41667 230.50000 123.0 160
## 17 14.50000 268.41892 150.5 148
## 18 14.58333 288.51309 153.0 191
## 19 14.66667 264.82927 192.0 164
## 20 14.75000 182.55621 103.0 169
# Mean friend count by fractional age, limited to ages below 71.
ggplot(
  data = subset(pf.fc_by_age_months, age_with_months < 71),
  mapping = aes(x = age_with_months, y = friend_count_mean)
) +
  geom_line()
Notes:
# Three views of mean friend count by age (ages below 71), each with a
# smoother layered on top, to compare levels of aggregation.

# p1: one point per year of age.
p1 <- ggplot(
  data = subset(pf.fc_by_age, age < 71),
  mapping = aes(x = age, y = friend_count_mean)
) +
  geom_line() +
  geom_smooth()

# p2: one point per month of age.
p2 <- ggplot(
  data = subset(pf.fc_by_age_months, age_with_months < 71),
  mapping = aes(x = age_with_months, y = friend_count_mean)
) +
  geom_line() +
  geom_smooth()

# p3: ages rounded to the nearest multiple of 5, averaging the yearly means
# that fall into each bin.
p3 <- ggplot(
  data = subset(pf.fc_by_age, age < 71),
  mapping = aes(x = round(age / 5) * 5, y = friend_count_mean)
) +
  geom_line(stat = 'summary', fun.y = mean) +
  geom_smooth()

library(gridExtra)
# Arrange the monthly, yearly, and 5-year views together in that order.
grid.arrange(p2, p1, p3)
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
Notes:
Reflection:
Click KnitHTML to see all of your hard work and to have an html page of this lesson, your answers, and your notes!